Antonio Coín Castro

Bayesian Functional Linear Regression

In [1]:
# -- Libraries

from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
import arviz as az
import numpy as np
from sklearn.svm import SVR
import pandas as pd
from IPython.display import display
import logging
import skfda
from skfda.preprocessing.dim_reduction.variable_selection import (
    RecursiveMaximaHunting as RMH
)
import os
from skfda.preprocessing.dim_reduction.feature_extraction import FPCA
from skfda.ml.regression import KNeighborsRegressor
from skfda.ml.regression import LinearRegression as FLinearRegression
from skfda.representation.basis import FDataBasis, Fourier, BSpline
from skfda.representation.grid import FDataGrid
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.exceptions import ConvergenceWarning
import sys
import pickle
import scipy
from multiprocessing import Pool
import utils
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
In [2]:
# -- Configuration

# Extensions
%load_ext autoreload
%autoreload 2

# Plotting configuration
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.rcParams['figure.figsize'] = [6, 4]
plt.style.use('arviz-darkgrid')
NCOLS = 3


def NROWS(x, ncols=NCOLS):
    return np.ceil(x/ncols).astype('int')


# Randomness and reproducibility
SEED = 42
np.random.seed(SEED)
rng = np.random.default_rng(SEED)

# Floating point precision for display
np.set_printoptions(precision=3, suppress=True)
pd.set_option("display.precision", 3)

# Multiprocessing
N_CORES = 4

# Ignore warnings
np.seterr(over='ignore', divide='ignore')
os.environ["PYTHONWARNINGS"] = 'ignore::UserWarning'

We consider the model

$$ Y = \alpha_0 + \Psi^{-1}_{X_i}(\alpha) + \varepsilon_i, $$

i.e.,

$$ Y_i\mid X_i=x_i \sim \mathcal N\left(\alpha_0 + \sum_{j=1}^p \beta_jx_i(\tau_j), \ \sigma^2\right). $$

The prior distributions we choose are:

\begin{align*} \pi(\alpha_0, \sigma^2) & \propto 1/\sigma^2, \\ \tau & \sim \mathscr U([0, 1]^p), \\ \beta\mid \tau, \sigma^2 & \sim \mathcal N\left(b_0, g\sigma^2\left[\mathcal X_\tau' \mathcal X_\tau + \eta \lambda_{\text{max}}(\mathcal X_\tau' \mathcal X_\tau)\right]^{-1}\right), \end{align*}

Note that for computational reasons we will work with $\log \sigma$ instead of $\sigma^2$, and hence the associated prior distribution is

$$ \pi(\alpha_0, \log\sigma) \propto 1. $$

Writing the parameter vector as $\theta = (\beta, \tau, \alpha_0, \log\sigma)$, the joint posterior probability is:

$$ \pi(\beta, \tau, \alpha_0, \log\sigma\mid Y) \propto \frac{|G_\tau|^{1/2}}{\sigma^{p+n}} \exp\left\{ -\frac{1}{2\sigma^2} \left(\|Y- \alpha_0\boldsymbol{1} - \mathcal X_\tau\beta\|^2 + \frac{1}{g}(\beta - b_0)'G_\tau(\beta - b_0) \right) \right\}. $$

Hence, the log-posterior probability is:

$$ \log \pi(\beta, \tau, \alpha_0, \log\sigma\mid Y) \propto \frac{1}{2}\log |G_\tau| - (p+n)\log \sigma -\frac{1}{2\sigma^2} \left(\|Y-\alpha_0\boldsymbol{1} - \mathcal X_\tau\beta\|^2 + \frac{1}{g}(\beta - b_0)'G_\tau(\beta - b_0) \right). $$

The metrics considered for model evaluation will be:

  • RMSE.
  • $R^2$.

Example dataset

We generate a toy dataset with $n=100$ functional regressors $X_i(t) \sim GP(0, K(s, t))$, a response variable given by either a $L^2$ model or a "simple" RKHS function, a value of $\alpha_0=5$ and a variance of $\sigma^2=0.5$. More precisely, we choose one of

$$ Y_i \sim \mathcal N\big(5 -5X_i(0.1) + 10X_i(0.8), \ 0.5\big) $$

or

$$ Y_i \sim \mathcal N\left(5 + \int_0^1 \beta(t)X_i(t)\, dt, \ 0.5\right), $$

where $\beta(t) \in L^2[0, 1]$.

We consider a regular grid of $N=100$ points on $[0, 1]$. In addition, we center the $X_i$ so that they have zero mean when fed to the sampling algorithms.

We also generate a test dataset with $n_{\text{test}}=50$ regressors for model evaluation.

In [3]:
# -- Data generation parameters

SYNTHETIC_DATA = True
MODEL_GEN = "RKHS"  # 'L2' or 'RKHS'
REAL_DATA = "Aemet"
STANDARDIZE_PREDICTORS = False
STANDARDIZE_RESPONSE = False
BASIS_REPRESENTATION = False
N_BASIS = 11

kernel_fn = utils.squared_exponential_kernel
beta_coef = utils.cholaquidis_scenario3
basis = Fourier(n_basis=N_BASIS)
In [4]:
# -- Dataset generation

if SYNTHETIC_DATA:
    n_train, n_test = 100, 50
    N = 100
    grid = np.linspace(1./N, 1., N)

    beta_true = np.array([-5., 10.])
    tau_true = np.array([0.1, 0.8])
    alpha0_true = 5.
    sigma2_true = 0.5

    if MODEL_GEN == "L2":
        x, y = utils.generate_gp_l2_dataset(
            grid, kernel_fn,
            n_train + n_test, beta_coef, alpha0_true,
            sigma2_true, rng=rng
        )
    elif MODEL_GEN == "RKHS":
        x, y = utils.generate_gp_rkhs_dataset(
            grid, kernel_fn,
            n_train + n_test, beta_true, tau_true,
            alpha0_true, sigma2_true, rng=rng
        )
    else:
        raise ValueError("Invalid model generation strategy.")

    # Train/test split
    X, X_test, Y, Y_test = train_test_split(
        x, y, train_size=n_train, random_state=SEED)

    # Create FData object
    X_fd = skfda.FDataGrid(X, grid)
    X_test_fd = skfda.FDataGrid(X_test, grid)

else:
    if REAL_DATA == "Tecator":
        x, y = skfda.datasets.fetch_tecator(return_X_y=True)
        y = np.sqrt(y[:, 1])  # Sqrt-Fat
    elif REAL_DATA == "Aemet":
        data = skfda.datasets.fetch_aemet()['data']
        data_matrix = data.data_matrix
        temperature = data_matrix[:, :, 0]
        x = FDataGrid(temperature, data.grid_points)
        # Log-Sum of log-precipitation for each station
        y = np.log(np.exp(data_matrix[:, :, 1]).sum(axis=1))
    else:
        raise ValueError("REAL_DATA must be 'Tecator' or 'Aemet'.")

    X_fd, X_test_fd, Y, Y_test = train_test_split(
        x, y, train_size=0.8, random_state=SEED)

    N = len(X_fd.grid_points[0])
    grid = np.linspace(1./N, 1., N)  # TODO: use (normalized) real grid
    n_train, n_test = len(X_fd.data_matrix), len(X_test_fd.data_matrix)

if BASIS_REPRESENTATION:
    X_fd = X_fd.to_basis(basis).to_grid(X_fd.grid_points[0])
    X_test_fd = X_test_fd.to_basis(basis).to_grid(X_fd.grid_points[0])

if STANDARDIZE_PREDICTORS:
    X_sd = X_fd.data_matrix.std(axis=0)
else:
    X_sd = np.ones(X_fd.data_matrix.shape[1:])

if STANDARDIZE_RESPONSE:
    Y_m = Y.mean()
    Y_sd = Y.std()
else:
    Y_m = 0.0
    Y_sd = 1.0

# Standardize data
X_m = X_fd.mean(axis=0)
X_fd = (X_fd - X_m)/X_sd
X = X_fd.data_matrix.reshape(-1, N)
X_test_fd = (X_test_fd - X_m)/X_sd
X_test = X_test_fd.data_matrix.reshape(-1, N)
Y = (Y - Y_m)/Y_sd
Y_test = (Y_test - Y_m)/Y_sd

utils.plot_dataset(
    X, Y, n_samples=n_train if not SYNTHETIC_DATA else n_train//2)

Common model hyperparameters

In our algorithms, we consider an unconstrained tranformed parameter space $\tilde \Theta=\mathbb{R}^{2\hat p+2}$ via the bijections

  • $\tau_j \mapsto \operatorname{logit}(\tau_j)$.
  • $\sigma^2 \mapsto \log\sigma$.
In [5]:
# -- Model hyperparameters

p_hat = 3
g = 5
eta = 1.0

TRANSFORM_TAU = False
FIT_SK = True
In [6]:
# -- Names and labels

# Names of parameters
theta_names = ["β", "τ", "α0", "σ2"]
if TRANSFORM_TAU:
    theta_names_ttr = ["β", "logit τ", "α0", "log σ"]
else:
    theta_names_ttr = ["β", "τ", "α0", "log σ"]
theta_names_aux = ["α0 and log σ"]

# Grouped labels
theta_labels_grouped = [r"$\beta$", r"$\tau$", r"$\alpha_0$", r"$\sigma^2$"]

# Individual labels
theta_labels = []
for i in range(p_hat):
    theta_labels.append(fr"$\beta_{i + 1}$")
for i in range(p_hat):
    theta_labels.append(fr"$\tau_{i + 1}$")
theta_labels.append(theta_labels_grouped[-2])
theta_labels.append(theta_labels_grouped[-1])

# Labels for Arviz
theta_labeller = az.labels.MapLabeller(
    var_name_map=dict(zip(theta_names[-2:], theta_labels_grouped[-2:])),
    coord_map={"projection": dict(
        zip(np.arange(p_hat), np.arange(1, p_hat + 1)))}
)

# Dimension of parameter vector
theta_ndim = len(theta_labels)

# Dimension of grouped parameter vector
theta_ndim_grouped = len(theta_names)

# Names of results columns
results_columns = ["Estimator", "Features", "MSE", "RMSE", r"$R^2$"]
In [7]:
# -- Parameter space and miscellaneous

if TRANSFORM_TAU:
    tau_ttr = utils.Logit()
else:
    tau_ttr = utils.Identity()

# Parameter space
theta_space = utils.ThetaSpace(
    p_hat, grid, theta_names, theta_names_ttr, theta_labels, tau_ttr=tau_ttr)

# Statistics for posterior predictive checks
statistics = [
    ("min", np.min),
    ("max", np.max),
    ("median", np.median),
    ("mean", np.mean),
    ("std", np.std)]

# Point estimates for posterior distribution
point_estimates = ["mode", "mean", "median"]

Sklearn model comparison

Currently there is a bug when trying to fit an already fitted FPCA object. It checks "if not self.weights" when it should be checking "if self.weights is None" (weights is an ndarray).

In [8]:
# -- Custom CV and transformers

def cv_sk(regressors, folds, X, Y, X_test, Y_test, verbose=False):
    df_metrics_sk = pd.DataFrame(columns=results_columns)

    for i, (name, pipe, params) in enumerate(regressors):
        if verbose:
            print(f"Fitting {name}...")
        reg_cv = GridSearchCV(pipe, params, scoring="neg_mean_squared_error",
                              n_jobs=N_CORES, cv=folds)
        reg_cv.fit(X, Y)
        Y_hat_sk = reg_cv.predict(X_test)
        metrics_sk = utils.regression_metrics(Y_test, Y_hat_sk)

        if name == "sk_fknn":
            n_features = f"K={reg_cv.best_params_['reg__n_neighbors']}"
        elif "svm" in name:
            n_features = reg_cv.best_estimator_["reg"].n_features_in_
        else:
            if isinstance(reg_cv.best_estimator_["reg"].coef_[0], FDataBasis):
                coef = reg_cv.best_estimator_["reg"].coef_[0].coefficients[0]
            else:
                coef = reg_cv.best_estimator_["reg"].coef_

            n_features = sum(~np.isclose(coef, 0))

        df_metrics_sk.loc[i] = [
            name,
            n_features,
            metrics_sk["mse"],
            metrics_sk["rmse"],
            metrics_sk["r2"]]

        df_metrics_sk.sort_values(results_columns[-2], inplace=True)

    return df_metrics_sk


def bayesian_var_sel(idata, theta_space, names,
                     X, Y, X_test, Y_test, folds,
                     prefix, point_est='mode',
                     verbose=False):
    grid = theta_space.grid
    p_hat = theta_space.p
    tau_hat = utils.point_estimate(
        idata, point_est, names)[p_hat:2*p_hat]
    idx_hat = np.abs(grid - tau_hat[:, np.newaxis]).argmin(1)

    regressors_var_sel = []
    alphas = np.logspace(-4, 4, 20)
    params_reg = {"reg__alpha": alphas}
    params_svm = {"reg__C": alphas,
                  "reg__gamma": ['auto', 'scale']}

    # Emcee+Lasso
    regressors_var_sel.append((f"{prefix}_{point_est}+sk_lasso",
                              Pipeline([
                                  ("var_sel", VariableSelection(grid, idx_hat)),
                                  ("data_matrix", DataMatrix()),
                                  ("reg", Lasso())]),
                              params_reg
                               ))

    # Emcee+Ridge
    regressors_var_sel.append((f"{prefix}_{point_est}+sk_ridge",
                              Pipeline([
                                  ("var_sel", VariableSelection(grid, idx_hat)),
                                  ("data_matrix", DataMatrix()),
                                  ("reg", Ridge())]),
                              params_reg
                               ))

    # Emcee+SVM RBF
    regressors_var_sel.append((f"{prefix}_{point_est}+sk_svm_rbf",
                              Pipeline([
                                  ("var_sel", VariableSelection(grid, idx_hat)),
                                  ("data_matrix", DataMatrix()),
                                  ("reg", SVR(kernel='rbf'))]),
                              params_svm
                               ))

    df_metrics_var_sel = cv_sk(regressors_var_sel, folds,
                               X, Y, X_test, Y_test, verbose)

    return df_metrics_var_sel


class FeatureSelector(BaseEstimator, TransformerMixin):

    def __init__(self, p=1):
        self.p = p

    def fit(self, X, y=None):
        N = X.shape[1]
        self.idx_ = np.linspace(0, N - 1, self.p).astype(int)
        return self

    def transform(self, X, y=None):
        return X[:, self.idx_]


class DataMatrix(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        self.N = len(X.grid_points[0])
        return self

    def transform(self, X, y=None):
        return X.data_matrix.reshape(-1, self.N)


class Basis(BaseEstimator, TransformerMixin):

    def __init__(self, basis=Fourier()):
        self.basis = basis

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X.to_basis(self.basis)


class VariableSelection(BaseEstimator, TransformerMixin):

    def __init__(self, grid=None, idx=None):
        self.grid = grid
        self.idx = idx
        self.idx.sort()

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return FDataGrid(X.data_matrix[:, self.idx], self.grid[self.idx])
In [9]:
# -- Select family of regressors

regressors = []

alphas = np.logspace(-4, 4, 20)
n_selected = [5, 10, 15, 20, 25, X.shape[1]]
n_components = [2, 3, 4, 5, 6, 10]
n_basis_bsplines = [8, 10, 12, 14, 16]
n_basis_fourier = [3, 5, 7, 9, 11]
basis_bspline = [BSpline(n_basis=p) for p in n_basis_bsplines]
basis_fourier = [Fourier(n_basis=p) for p in n_basis_fourier]
n_neighbors = [3, 5, 7]

params_reg = {"reg__alpha": alphas}
params_svm = {"reg__C": alphas,
              "reg__gamma": ['auto', 'scale']}
params_select = {"selector__p": n_selected}
params_fpca = {"dim_red__n_components": n_components}
params_basis = {"basis__basis": basis_bspline + basis_fourier}
params_knn = {"reg__n_neighbors": n_neighbors,
              "reg__weights": ['uniform', 'distance']}

# Lasso
regressors.append(("sk_lasso",
                   Pipeline([
                       ("data_matrix", DataMatrix()),
                       ("reg", Lasso())]),
                   params_reg
                   ))

# FPCA+Ridge
regressors.append(("fpca+sk_ridge",
                   Pipeline([
                       ("dim_red", FPCA()),  # Retains scores only
                       ("reg", Ridge())]),
                   {**params_fpca, **params_reg}
                   ))

# Manual+Ridge
regressors.append(("manual_sel+sk_ridge",
                   Pipeline([
                       ("data_matrix", DataMatrix()),
                       ("selector", FeatureSelector()),
                       ("reg", Ridge())]),
                   {**params_reg, **params_select}
                   ))

# RMH+Ridge
regressors.append(("rmh+sk_ridge",
                   Pipeline([
                       ("var_sel", RMH()),
                       ("reg", Ridge())]),
                   {}
                   ))

# FPCA+Lin
regressors.append(("fpca+sk_lin",
                   Pipeline([
                       ("dim_red", FPCA()),  # Retains scores only
                       ("reg", LinearRegression())]),
                   params_fpca
                   ))


# FPCA+SVM RBF
regressors.append(("fpca+sk_svm_rbf",
                   Pipeline([
                       ("dim_red", FPCA()),  # Retains scores only
                       ("reg", SVR(kernel='rbf'))]),
                   {**params_fpca, **params_svm}
                   ))

# Manual+SVM RBF
regressors.append(("manual_sel+sk_svm_rbf",
                   Pipeline([
                       ("data_matrix", DataMatrix()),
                       ("selector", FeatureSelector()),
                       ("reg", SVR(kernel='rbf'))]),
                   {**params_select, **params_svm}
                   ))

# RMH+SVM RBF
regressors.append(("rmh+sk_svm_rbf",
                   Pipeline([
                       ("var_sel", RMH()),
                       ("reg", SVR(kernel='rbf'))]),
                   params_svm
                   ))

# Functional Linear Regression
regressors.append(("sk_flin",
                   Pipeline([
                       ("basis", Basis()),
                       ("reg", FLinearRegression())]),
                   params_basis
                   ))

# KNeighbors Functional Regression
regressors.append(("sk_fknn",
                   Pipeline([
                       ("reg", KNeighborsRegressor())]),
                   params_knn
                   ))
In [10]:
# -- Fit models and show metrics

folds = KFold(shuffle=True, random_state=SEED)

if FIT_SK:
    df_metrics_sk = cv_sk(regressors, folds, X_fd, Y,
                          X_test_fd, Y_test, verbose=True)
    display(df_metrics_sk.style.hide_index())
Fitting sk_lasso...
Fitting fpca+sk_ridge...
Fitting manual_sel+sk_ridge...
Fitting rmh+sk_ridge...
Fitting fpca+sk_lin...
Fitting fpca+sk_svm_rbf...
Fitting manual_sel+sk_svm_rbf...
Fitting rmh+sk_svm_rbf...
Fitting sk_flin...
Fitting sk_fknn...
Estimator Features MSE RMSE $R^2$
sk_lasso 22 0.529 0.727 0.996
manual_sel+sk_ridge 100 0.556 0.746 0.996
fpca+sk_lin 6 0.558 0.747 0.996
fpca+sk_ridge 6 0.558 0.747 0.996
sk_flin 7 0.566 0.753 0.996
rmh+sk_ridge 2 1.043 1.021 0.993
manual_sel+sk_svm_rbf 100 10.067 3.173 0.933
rmh+sk_svm_rbf 2 11.467 3.386 0.924
fpca+sk_svm_rbf 6 11.555 3.399 0.923
sk_fknn K=5 12.874 3.588 0.914

Maximum Likelihood Estimator

In [11]:
# -- Negative log-likelihood definition in transformed parameter space

def neg_ll(theta_tr, X, Y, theta_space):
    """Transformed parameter vector 'theta_tr' is (β, τ, α0, log σ)."""

    n, N = X.shape
    grid = theta_space.grid

    assert len(theta_tr) == theta_space.ndim

    theta = theta_space.backward(theta_tr)
    beta, tau, alpha0, sigma2 = theta_space.get_params(theta)
    log_sigma = theta_space.get_sigma2(theta_tr)

    idx = np.abs(grid - tau[:, np.newaxis]).argmin(1)
    X_tau = X[:, idx]

    return -(-n*log_sigma
             - np.linalg.norm(Y - alpha0 - X_tau@beta)**2/(2*sigma2))
In [12]:
# -- MLE estimation

method_mle = 'L-BFGS-B'  # 'Nelder-Mead', 'Powell' or 'L-BFGS-B'
strategy_mle = 'global'

theta_init = theta_space.forward(
    np.array([0.0]*p_hat + [0.5]*p_hat + [Y.mean()] + [1.0]))

if TRANSFORM_TAU:
    bounds = None
else:
    bounds = ([(None, None)]*p_hat
              + [(theta_space.tau_lb, theta_space.tau_ub)]*p_hat
              + [(None, None)]
              + [(None, None)])

if strategy_mle == 'local':
    mle_theta_tr = scipy.optimize.minimize(
        neg_ll,
        x0=theta_init,
        bounds=bounds,
        method=method_mle,
        args=(X, Y, theta_space)
    ).x
    bic = utils.compute_bic(theta_space, neg_ll, mle_theta_tr, X, Y)
elif strategy_mle == 'global':
    mles = np.zeros((N_CORES, theta_space.ndim))

    def optimizer(rng):
        return scipy.optimize.basinhopping(
            neg_ll,
            x0=theta_init,
            seed=rng,
            minimizer_kwargs={"args": (X, Y, theta_space),
                              "method": method_mle,
                              "bounds": bounds}
        ).x

    with Pool(N_CORES) as p:
        print(f"-- Computing MLE with {N_CORES} independent runs --")
        rngs = [np.random.default_rng(SEED + i) for i in range(N_CORES)]
        mles = p.map(optimizer, rngs)
        bics = utils.bic = utils.compute_bic(theta_space, neg_ll, mles, X, Y)
        mle_theta_tr = mles[np.argmin(bics)]
        bic = bics[np.argmin(bics)]
else:
    raise ValueError('Invalid strategy for MLE computation.')

mle_theta = theta_space.backward(mle_theta_tr)
Y_hat_mle = utils.generate_response(X_test, mle_theta, noise=False)
df_metrics_mle = pd.DataFrame(columns=results_columns)
metrics_mle = utils.regression_metrics(Y_test, Y_hat_mle)
df_metrics_mle.loc[0] = [
    "mle",
    p_hat,
    metrics_mle["mse"],
    metrics_mle["rmse"],
    metrics_mle["r2"]
]

print(f"\nBIC: {bic:.3f}")
print("MLE:")
display(pd.DataFrame(zip(theta_space.labels, mle_theta),
                     columns=["", "MLE"]).style.hide_index())
print("Regression metrics:")
df_metrics_mle.style.hide_index()
-- Computing MLE with 4 independent runs --

BIC: 96.259
MLE:
MLE
$\beta_1$ 10.121
$\beta_2$ -4.928
$\beta_3$ 0.012
$\tau_1$ 0.807
$\tau_2$ 0.103
$\tau_3$ 0.411
$\alpha_0$ 5.272
$\sigma^2$ 0.666
Regression metrics:
Out[12]:
Estimator Features MSE RMSE $R^2$
mle 3 0.790 0.889 0.995

The Ensemble Sampler and the emcee library

In [13]:
import emcee

Model

We only need to provide the sampler with the logarithm of the posterior distribution. For clarity we split up its computation in log-prior and log-likelihood, although for a more efficient implementation it should all be in one function.

In [14]:
# -- Log-posterior model

def log_prior(theta_tr):
    """Global parameters (for efficient parallelization): 
        X, b0, g, eta, theta_space"""
    assert len(theta_tr) == theta_space.ndim

    n, N = X.shape
    p = theta_space.p
    grid = theta_space.grid

    theta = theta_space.backward(theta_tr)
    beta, tau, alpha0, sigma2 = theta_space.get_params(theta)
    log_sigma = theta_space.get_sigma2(theta_tr)

    if not TRANSFORM_TAU:
        if (tau < theta_space.tau_lb).any() or (tau > theta_space.tau_ub).any():
            return -np.inf

    # Transform variables
    b = beta - b0

    # Compute and regularize G_tau
    idx = np.abs(grid - tau[:, np.newaxis]).argmin(1)
    X_tau = X[:, idx]
    G_tau = X_tau.T@X_tau
    G_tau = (G_tau + G_tau.T)/2.  # Enforce symmetry
    G_tau_reg = G_tau + eta * \
        np.max(np.linalg.eigvalsh(G_tau))*np.identity(p)

    # Compute log-prior
    log_prior = (0.5*utils.logdet(G_tau_reg)
                 - p*log_sigma
                 - b.T@G_tau_reg@b/(2*g*sigma2))

    return log_prior


def log_likelihood(theta_tr, Y):
    """Global parameters (for efficient parallelization): 
        X, theta_space, return_ll"""
    n, N = X.shape
    grid = theta_space.grid

    assert len(theta_tr) == theta_space.ndim

    theta = theta_space.backward(theta_tr)
    beta, tau, alpha0, sigma2 = theta_space.get_params(theta)
    log_sigma = theta_space.get_sigma2(theta_tr)

    idx = np.abs(grid - tau[:, np.newaxis]).argmin(1)
    X_tau = X[:, idx]

    ll = (-n*log_sigma
          - np.linalg.norm(Y - alpha0 - X_tau@beta)**2/(2*sigma2))

    if return_ll:
        # Add constant term so that it is the genuine log-probability
        ll_pointwise = (-log_sigma - 0.5*np.log(2*np.pi)
                        - (Y - alpha0 - X_tau@beta)**2/(2*sigma2))
        return ll, ll_pointwise
    else:
        return ll


def log_posterior(theta_tr, Y):
    """Global parameters (for efficient parallelization): 
        X, rng, return_pp, return_ll, theta_space"""
    # Compute log-prior
    lp = log_prior(theta_tr)

    if not np.isfinite(lp):
        if return_pp and return_ll:
            return -np.inf, np.full_like(Y, -np.inf), np.full_like(Y, -np.inf)
        elif return_pp:
            return -np.inf, np.full_like(Y, -np.inf)
        elif return_ll:
            return -np.inf, np.full_like(Y, -np.inf)
        else:
            return -np.inf

    # Compute log-likelihood (and possibly pointwise log-likelihood)
    if return_ll:
        ll, ll_pointwise = log_likelihood(theta_tr, Y)
    else:
        ll = log_likelihood(theta_tr, Y)

    # Compute log-posterior
    lpos = lp + ll

    # Compute posterior predictive samples
    if return_pp:
        theta = theta_space.backward(theta_tr)
        pp = utils.generate_response(X, theta, rng=rng)

    # Return information
    if return_pp and return_ll:
        return lpos, pp, ll_pointwise
    elif return_pp:
        return lpos, pp
    elif return_ll:
        return lpos, ll_pointwise
    else:
        return lpos

Experiments

We set up the initial points of the chains to be in a random neighbourhood around the MLE to increase the speed of convergence.

In [15]:
def run_emcee(n_walkers, n_iter_initial, n_iter, moves,
              thin, thin_pp, return_pp, return_ll):
    # -- Run sampler

    with Pool(N_CORES) as pool:
        print(
            f"-- Running affine-invariant ensemble sampler with {N_CORES} cores --")

        sampler = emcee.EnsembleSampler(
            n_walkers, theta_ndim, log_posterior,
            pool=pool, args=(Y,),
            moves=moves)

        print("Tuning phase...")
        state = sampler.run_mcmc(
            p0, n_iter_initial, progress='notebook',
            store=False)
        sampler.reset()

        print("MCMC sampling...")
        sampler.run_mcmc(state, n_iter, progress='notebook')

    print(
        f"Mean acceptance fraction: {100*np.mean(sampler.acceptance_fraction):.3f}%")

    logging.disable(sys.maxsize)  # Disable logger

    # Analyze autocorrelation and set burn-in and thinning values
    autocorr = sampler.get_autocorr_time(quiet=True)
    max_autocorr = np.max(autocorr)
    if (np.isfinite(autocorr)).all():
        burn = int(3*max_autocorr)
    else:
        print("Some autocorrelation value is not finite")
        burn = 500

    logging.disable(logging.NOTSET)  # Re-enable logger

    # Get InferenceData object
    idata_emcee = utils.emcee_to_idata(
        sampler, theta_space, burn, thin,
        ["y_star"] if return_pp else [], return_ll)

    print("\n-- Summary statistics --")
    display(utils.summary(idata_emcee, var_names=theta_names,
                          kind="stats", labeller=theta_labeller))

    # -- Compute metrics using several point estimates

    df_metrics_emcee = pd.DataFrame(columns=results_columns)

    # Posterior mean estimate
    pp_test = utils.generate_pp(
        idata_emcee, X_test, theta_names, thin_pp, rng=rng)

    print("Computing metrics...", end="\r")

    Y_hat_pp = pp_test.mean(axis=(0, 1))
    metrics_pp = utils.regression_metrics(Y_test, Y_hat_pp)
    df_metrics_emcee.loc[0] = [
        "emcee_posterior_mean",
        p_hat,
        metrics_pp["mse"],
        metrics_pp["rmse"],
        metrics_pp["r2"]
    ]

    # Point estimates
    for i, pe in enumerate(point_estimates):
        Y_hat_pe = utils.point_predict(
            X_test, idata_emcee,
            theta_names, pe)
        metrics_pe = utils.regression_metrics(Y_test, Y_hat_pe)
        df_metrics_emcee.loc[i + 1] = [
            "emcee_" + pe,
            p_hat,
            metrics_pe["mse"],
            metrics_pe["rmse"],
            metrics_pe["r2"]
        ]

    # Bayesian variable selection
    for pe in point_estimates:
        df_metrics_var_sel = bayesian_var_sel(
            idata_emcee, theta_space, theta_names, X_fd,
            Y, X_test_fd, Y_test, folds, prefix="emcee",
            point_est=pe)

        df_metrics_emcee = df_metrics_emcee.append(df_metrics_var_sel)

    if FIT_SK:
        df_metrics_emcee = df_metrics_emcee.append(df_metrics_sk)
    df_metrics_emcee = df_metrics_emcee.append(df_metrics_mle)
    df_metrics_emcee.sort_values(results_columns[-2], inplace=True)

    print("-- Regression metrics --")
    display(df_metrics_emcee.style.hide_index())

    return sampler, idata_emcee, df_metrics_emcee
In [16]:
# -- Sampler parameters

n_walkers = 64
n_iter_initial = 100
n_iter = 1000
return_pp = True
return_ll = True
frac_random = 0.3

sd_beta_init = 1.0
sd_tau_init = 0.2
mean_alpha0_init = Y.mean()
sd_alpha0_init = 1.0
param_sigma2_init = 2.0  # shape parameter in inv_gamma distribution
sd_sigma2_init = 1.0

moves = [
    (emcee.moves.StretchMove(), 0.7),
    (emcee.moves.WalkMove(), 0.3)
]

thin = 1
thin_pp = 5

FAST_RUN = True
In [17]:
# -- Run sampler

# Start every walker in a (random) neighbourhood around the MLE
p0 = utils.weighted_initial_guess_around_value(
    theta_space, mle_theta_tr, sd_beta_init, sd_tau_init,
    mean_alpha0_init, sd_alpha0_init, param_sigma2_init,
    sd_sigma2_init, n_walkers=n_walkers, rng=rng,
    frac_random=frac_random)

b0 = mle_theta_tr[theta_space.beta_idx]

if FAST_RUN:
    sampler, idata_emcee, df_metrics_emcee_full = run_emcee(
        n_walkers, n_iter_initial, n_iter, moves,
        thin, thin_pp, return_pp, return_ll)
else:
    with Pool(N_CORES) as pool:
        print(
            f"-- Running affine-invariant ensemble sampler with {N_CORES} cores --")

        sampler = emcee.EnsembleSampler(
            n_walkers, theta_ndim, log_posterior,
            pool=pool, args=(Y,),
            moves=moves)

        print("Tuning phase...")
        state = sampler.run_mcmc(
            p0, n_iter_initial, progress='notebook',
            store=False)
        sampler.reset()

        print("MCMC sampling...")
        sampler.run_mcmc(state, n_iter, progress='notebook')

    print(
        f"Mean acceptance fraction: {100*np.mean(sampler.acceptance_fraction):.3f}%")
-- Running affine-invariant ensemble sampler with 4 cores --
Tuning phase...
  0%|          | 0/100 [00:00<?, ?it/s]
MCMC sampling...
  0%|          | 0/1000 [00:00<?, ?it/s]
Mean acceptance fraction: 26.050%

-- Summary statistics --
mean sd hdi_3% hdi_97% mode median
β[0] 9.969 1.320 10.000 10.260 10.101 10.135
β[1] -4.819 0.709 -5.041 -4.767 -4.891 -4.906
β[2] 0.057 0.653 -0.196 0.140 -0.042 -0.026
τ[0] 0.788 0.096 0.795 0.805 0.801 0.800
τ[1] 0.114 0.109 0.095 0.105 0.104 0.100
τ[2] 0.509 0.277 0.015 0.931 0.431 0.502
$\alpha_0$ 5.246 0.210 5.134 5.398 5.269 5.268
$\sigma^2$ 2.579 17.177 0.313 0.557 0.430 0.428
Posterior predictive samples:   0%|          | 0/64 [00:00<?, ?it/s]
-- Regression metrics --
Estimator Features MSE RMSE $R^2$
emcee_mode+sk_lasso 3 0.507 0.712 0.997
emcee_mode+sk_ridge 3 0.507 0.712 0.997
emcee_median 3 0.511 0.715 0.997
emcee_mode 3 0.512 0.716 0.997
emcee_median+sk_lasso 3 0.516 0.718 0.997
emcee_median+sk_ridge 3 0.516 0.718 0.997
emcee_posterior_mean 3 0.525 0.725 0.997
sk_lasso 22 0.529 0.727 0.996
manual_sel+sk_ridge 100 0.556 0.746 0.996
fpca+sk_lin 6 0.558 0.747 0.996
fpca+sk_ridge 6 0.558 0.747 0.996
emcee_mean+sk_lasso 3 0.560 0.749 0.996
emcee_mean+sk_ridge 3 0.561 0.749 0.996
sk_flin 7 0.566 0.753 0.996
emcee_mean 3 0.761 0.872 0.995
mle 3 0.790 0.889 0.995
rmh+sk_ridge 2 1.043 1.021 0.993
emcee_median+sk_svm_rbf 3 7.492 2.737 0.950
emcee_mean+sk_svm_rbf 3 10.028 3.167 0.933
manual_sel+sk_svm_rbf 100 10.067 3.173 0.933
emcee_mode+sk_svm_rbf 3 10.352 3.217 0.931
rmh+sk_svm_rbf 2 11.467 3.386 0.924
fpca+sk_svm_rbf 6 11.555 3.399 0.923
sk_fknn K=5 12.874 3.588 0.914

Analysis

We analyze the samples of all chains, discarding a few times the integrated autocorrelation times worth of samples. We could also perform thinning and take only every $k$-th value.

In [18]:
# -- Sampler statistics and trace (with burn-in and thinning)

logging.disable(sys.maxsize)  # Disable logger

# Analyze autocorrelation and set burn-in and thinning values
autocorr = sampler.get_autocorr_time(quiet=True)
max_autocorr = np.max(autocorr)
if (np.isfinite(autocorr)).all():
    burn = int(3*max_autocorr)
else:
    print("Some autocorrelation value is not finite")
    burn = 500

# Get trace of samples
trace_flat = utils.get_trace_emcee(sampler, theta_space, burn, thin, flat=True)

# Get InferenceData object
idata_emcee = utils.emcee_to_idata(
    sampler, theta_space, burn, thin,
    ["y_star"] if return_pp else [], return_ll)

# Update and show autocorrelation
autocorr_thin = sampler.get_autocorr_time(discard=burn, thin=thin, quiet=True)

logging.disable(logging.NOTSET)  # Re-enable logger

pd.DataFrame(
    zip(theta_labels, autocorr_thin, len(trace_flat)/autocorr_thin),
    columns=["", "Autocorrelation times", "Effective i.i.d samples"]
).style.hide_index()
Out[18]:
Autocorrelation times Effective i.i.d samples
$\beta_1$ 49.702 950.298
$\beta_2$ 51.113 924.076
$\beta_3$ 48.669 970.483
$\tau_1$ 52.123 906.158
$\tau_2$ 49.478 954.599
$\tau_3$ 55.748 847.236
$\alpha_0$ 42.702 1106.079
$\sigma^2$ 46.640 1012.697
In [19]:
utils.summary(idata_emcee, var_names=theta_names,
              kind="stats", labeller=theta_labeller)
Out[19]:
mean sd hdi_3% hdi_97% mode median
β[0] 9.969 1.320 10.000 10.260 10.101 10.135
β[1] -4.819 0.709 -5.041 -4.767 -4.891 -4.906
β[2] 0.057 0.653 -0.196 0.140 -0.042 -0.026
τ[0] 0.788 0.096 0.795 0.805 0.801 0.800
τ[1] 0.114 0.109 0.095 0.105 0.104 0.100
τ[2] 0.509 0.277 0.015 0.931 0.431 0.502
$\alpha_0$ 5.246 0.210 5.134 5.398 5.269 5.268
$\sigma^2$ 2.579 17.177 0.313 0.557 0.430 0.428
In [20]:
az.plot_trace(idata_emcee, labeller=theta_labeller,
              combined=True, var_names=theta_names)
print("Combined density and trace plot:")
Combined density and trace plot:
In [21]:
az.plot_posterior(idata_emcee, labeller=theta_labeller, point_estimate='mode',
                  grid=(NROWS(theta_ndim), NCOLS), textsize=20,
                  var_names=theta_names)
print("Marginal posterior distributions:")
Marginal posterior distributions:

We can perform a couple of visual posterior predictive checks. In particular:

  • A plot of the distribution of $Y$ and the distribution of $\{Y^*_m\}_m$ (one for every sample $\theta_m$ of the chain), generated using the original traning data $X$.
  • A plot of the distribution of $T(Y^*)$, where $T(x)=\bar x$.

We also show the Bayesian p-value for several statistics, which is defined as $P(T(y^*)\leq T(y)\mid y)$, and is computed by simply measuring the proportion of generated samples $\{T(Y^*_m)\}_m$ that fall below the real value of the statistic. It is expected to be around $0.5$ when the model accurately represents the data.

In [22]:
# -- Generate and plot posterior predictive checks from X

if "posterior_predictive" not in idata_emcee:
    pp = utils.generate_pp(idata_emcee, X, theta_names, rng=rng)
    utils.pp_to_idata([pp], idata_emcee, ['y_star'], merge=True)
else:
    pp = idata_emcee.posterior_predictive['y_star'].to_numpy()

# Posterior predictive checks
fig, axs = plt.subplots(1, 2, figsize=(10, 4))
plt.suptitle(r"Posterior predictive checks on $X$")

utils.plot_ppc(idata_emcee, n_samples=500, ax=axs[0],
               data_pairs={'y_obs': 'y_star'})

az.plot_bpv(idata_emcee, kind='t_stat', t_stat='mean',
            ax=axs[1], plot_mean=False,
            data_pairs={'y_obs': 'y_star'}, bpv=False)
axs[1].axvline(Y.mean(), ls="--",
               color="r", lw=2, label=r"$\bar Y$")
handles, labels = axs[1].get_legend_handles_labels()
handles.extend([Line2D([0], [0], label=r"Distribution of $\bar Y^*$")])
axs[1].legend(handles=handles)

# Show Bayesian p-values
for name, stat in statistics:
    bpv = utils.bpv(pp, Y, stat)
    print(f"bpv [T={name}]: {bpv:.3f}")
bpv [T=min]: 0.042
bpv [T=max]: 0.515
bpv [T=median]: 0.820
bpv [T=mean]: 0.509
bpv [T=std]: 0.504
In [23]:
az.plot_autocorr(idata_emcee, combined=True, var_names=theta_names,
                 grid=(NROWS(theta_ndim), NCOLS), labeller=theta_labeller)
print("Combined autocorrelation times:")
Combined autocorrelation times:

Out-of-sample predictions

In [24]:
# -- Generate and plot posterior predictive checks from X_test

# Posterior predictive checks
pp_test = utils.generate_pp(idata_emcee, X_test, theta_names, rng=rng)
idata_pp_test = utils.pp_to_idata(
    [pp_test], idata_emcee, ['y_star'], y_obs=Y_test)

fig, axs = plt.subplots(1, 2, figsize=(10, 4))
plt.suptitle(r"Posterior predictive checks on $X_{test}$")

utils.plot_ppc(idata_pp_test, n_samples=500, data_pairs={
               'y_obs': 'y_star'}, ax=axs[0])

az.plot_bpv(idata_pp_test, kind='t_stat', t_stat='mean', data_pairs={
            'y_obs': 'y_star'}, plot_mean=False, ax=axs[1], bpv=False)
axs[1].axvline(Y_test.mean(), ls="--",
               color="r", lw=2, label=r"$\bar Y_{test}$")
handles, labels = axs[1].get_legend_handles_labels()
handles.extend([Line2D([0], [0], label=r"Distribution of $\bar Y^*_{test}$")])
_ = axs[1].legend(handles=handles)

# Show Bayesian p-values
for name, stat in statistics:
    bpv = utils.bpv(pp_test, Y_test, stat)
    print(f"bpv [T={name}]: {bpv:.3f}")
Posterior predictive samples:   0%|          | 0/64 [00:00<?, ?it/s]
bpv [T=min]: 0.648
bpv [T=max]: 0.683
bpv [T=median]: 0.537
bpv [T=mean]: 0.538
bpv [T=std]: 0.617
In [25]:
# -- Compute metrics using several point estimates

df_metrics_emcee = pd.DataFrame(columns=results_columns)

# Posterior mean estimate
Y_hat_pp = pp_test[:, ::thin_pp, :].mean(axis=(0, 1))
metrics_pp = utils.regression_metrics(Y_test, Y_hat_pp)
df_metrics_emcee.loc[0] = [
    "emcee_posterior_mean",
    p_hat,
    metrics_pp["mse"],
    metrics_pp["rmse"],
    metrics_pp["r2"]
]

# Point estimates
for i, pe in enumerate(point_estimates):
    Y_hat_pe = utils.point_predict(
        X_test, idata_emcee,
        theta_names, pe)
    metrics_pe = utils.regression_metrics(Y_test, Y_hat_pe)
    df_metrics_emcee.loc[i + 1] = [
        "emcee_" + pe,
        p_hat,
        metrics_pe["mse"],
        metrics_pe["rmse"],
        metrics_pe["r2"]
    ]

df_metrics_emcee.sort_values(results_columns[-2], inplace=True)
df_metrics_emcee.style.hide_index()
Out[25]:
Estimator Features MSE RMSE $R^2$
emcee_median 3 0.511 0.715 0.997
emcee_mode 3 0.512 0.716 0.997
emcee_posterior_mean 3 0.523 0.723 0.997
emcee_mean 3 0.761 0.872 0.995
In [26]:
# -- Test variable selection procedure

df_metrics_emcee_var_sel = pd.DataFrame(columns=results_columns)

for pe in point_estimates:
    df_var_sel = bayesian_var_sel(
        idata_emcee, theta_space, theta_names, X_fd,
        Y, X_test_fd, Y_test, folds, prefix="emcee",
        point_est=pe)

    df_metrics_emcee_var_sel = df_metrics_emcee_var_sel.append(df_var_sel)

df_metrics_emcee_var_sel.sort_values(results_columns[-2], inplace=True)
df_metrics_emcee_var_sel.style.hide_index()
Out[26]:
Estimator Features MSE RMSE $R^2$
emcee_mode+sk_lasso 3 0.507 0.712 0.997
emcee_mode+sk_ridge 3 0.507 0.712 0.997
emcee_median+sk_lasso 3 0.516 0.718 0.997
emcee_median+sk_ridge 3 0.516 0.718 0.997
emcee_mean+sk_lasso 3 0.560 0.749 0.996
emcee_mean+sk_ridge 3 0.561 0.749 0.996
emcee_median+sk_svm_rbf 3 7.492 2.737 0.950
emcee_mean+sk_svm_rbf 3 10.028 3.167 0.933
emcee_mode+sk_svm_rbf 3 10.352 3.217 0.931

Save & Load

This is only for testing purposes; in a production environment one should use the Backends feature of emcee.

In [44]:
# -- Save

with open("emcee-p-fixed.idata", 'wb') as file:
    pickle.dump(idata_emcee, file)
In [45]:
# -- Load

with open("emcee-p-fixed.idata", 'rb') as file:
    idata_emcee = pickle.load(file)
    trace = idata_emcee.posterior.to_array().to_numpy().T
    trace_flat = trace.reshape(-1, trace.shape[-1])  # All chains combined

The PyMC library

In [27]:
import pymc3 as pm
import theano
import theano.tensor as tt

Model

In [28]:
# -- Probabilistic model

def make_model(theta_space, g, eta, X, Y, names, names_aux, mle_theta=None):
    n, N = X.shape
    grid = theta_space.grid
    p = theta_space.p

    if mle_theta is not None:
        b0 = mle_theta[:p]
    else:
        b0 = g*rng.standard_normal(size=p)  # <-- Change if needed

    with pm.Model() as model:
        X_pm = pm.Data('X', X)

        alpha0_and_log_sigma = pm.DensityDist(
            names_aux[0], lambda x: 0, shape=(2,))

        alpha0 = pm.Deterministic(names[-2], alpha0_and_log_sigma[0])

        log_sigma = alpha0_and_log_sigma[1]
        sigma = pm.math.exp(log_sigma)
        sigma2 = pm.Deterministic(names[-1], sigma**2)

        tau = pm.Uniform(names[1], 0.0, 1.0, shape=(p,))

        idx = np.abs(grid - tau[:, np.newaxis]).argmin(1)
        X_tau = X_pm[:, idx]
        G_tau = pm.math.matrix_dot(X_tau.T, X_tau)
        G_tau = (G_tau + G_tau.T)/2.  # Enforce symmetry
        G_tau_reg = G_tau + eta * \
            tt.max(tt.nlinalg.eigh(G_tau)[0])*np.identity(p)

        def beta_lprior(x):
            b = x - b0

            return (0.5*pm.math.logdet(G_tau_reg)
                    - p*log_sigma
                    - pm.math.matrix_dot(b.T, G_tau_reg, b)/(2.*g*sigma2))

        beta = pm.DensityDist(names[0], beta_lprior, shape=(p,))

        expected_obs = alpha0 + pm.math.matrix_dot(X_tau, beta)

        y_obs = pm.Normal('y_obs', mu=expected_obs, sigma=sigma, observed=Y)

    return model

Experiments

In [29]:
# -- Hyperparameters

burn = 0
thin = 1
thin_pp = 5

n_samples_nuts = 1000
tune_nuts = 1000
target_accept = 0.8
n_samples_metropolis = 10000
tune_metropolis = 5000

USE_NUTS = True
In [30]:
# -- Run sampler

model = make_model(theta_space, g, eta, X, Y, theta_names,
                   theta_names_aux[:1], mle_theta_tr)

with model:
    if USE_NUTS:
        idata_pymc = pm.sample(n_samples_nuts, cores=2,
                               tune=tune_nuts,
                               target_accept=target_accept,
                               return_inferencedata=True)
    else:
        step = pm.Metropolis()
        idata_pymc = pm.sample(n_samples_metropolis, cores=2,
                               tune=tune_metropolis, step=step,
                               return_inferencedata=True)

    idata_pymc = idata_pymc.sel(draw=slice(burn, None, thin))
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [β, τ, α0 and log σ]
100.00% [4000/4000 00:30<00:00 Sampling 2 chains, 0 divergences]
Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 30 seconds.
The number of effective samples is smaller than 25% for some parameters.

Analysis

Since the tuning iterations already serve as burn-in, we keep the whole trace. In addition, we could consider thinning the samples.

In [31]:
utils.summary(idata_pymc, var_names=theta_names, labeller=theta_labeller)
Out[31]:
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_bulk ess_tail r_hat mode median
β[0] 10.138 0.060 10.023 10.248 0.003 0.002 450.0 557.0 1.00 10.149 10.142
β[1] -4.903 0.071 -5.054 -4.790 0.005 0.004 265.0 129.0 1.01 -4.905 -4.907
β[2] -0.027 0.082 -0.191 0.117 0.005 0.004 256.0 218.0 1.01 -0.025 -0.028
τ[0] 0.800 0.003 0.795 0.805 0.000 0.000 493.0 935.0 1.00 0.797 0.800
τ[1] 0.100 0.003 0.095 0.105 0.000 0.000 303.0 765.0 1.01 0.095 0.100
τ[2] 0.486 0.280 0.000 0.928 0.016 0.012 270.0 419.0 1.00 0.448 0.473
$\alpha_0$ 5.271 0.066 5.149 5.388 0.004 0.003 321.0 504.0 1.00 5.248 5.271
$\sigma^2$ 0.429 0.063 0.323 0.549 0.003 0.002 380.0 490.0 1.02 0.417 0.422
In [32]:
az.plot_trace(idata_pymc, var_names=theta_names, labeller=theta_labeller)
print("Density and trace plot:")
Density and trace plot:
In [33]:
az.plot_posterior(
    idata_pymc, point_estimate='mode',
    var_names=theta_names,
    labeller=theta_labeller,
    textsize=20,
    grid=(NROWS(theta_ndim), NCOLS))
print("Marginal posterior distributions:")
Marginal posterior distributions:
In [34]:
# -- Generate and plot posterior predictive samples from X

with model:
    print("Generating posterior predictive samples...")
    pp = utils.generate_pp(idata_pymc, X, theta_names, rng=rng)
    utils.pp_to_idata([pp], idata_pymc, ['y_star'], merge=True)

# Posterior predictive checks
fig, axs = plt.subplots(1, 2, figsize=(10, 4))
plt.suptitle(r"Posterior predictive checks on $X$")

utils.plot_ppc(idata_pymc, n_samples=500,
               ax=axs[0], data_pairs={"y_obs": "y_star"})

az.plot_bpv(idata_pymc, kind='t_stat', t_stat='mean',
            plot_mean=False, ax=axs[1],
            bpv=False, data_pairs={"y_obs": "y_star"})
axs[1].axvline(Y.mean(), ls="--",
               color="r", lw=2, label=r"$\bar Y$")
handles, labels = axs[1].get_legend_handles_labels()
handles.extend([Line2D([0], [0], label=r"Distribution of $\bar Y^*$")])
_ = axs[1].legend(handles=handles)

# Show Bayesian p-values
for name, stat in statistics:
    bpv = utils.bpv(pp, Y, stat)
    print(f"bpv [T={name}]: {bpv:.3f}")
Generating posterior predictive samples...
Posterior predictive samples:   0%|          | 0/2 [00:00<?, ?it/s]
bpv [T=min]: 0.020
bpv [T=max]: 0.529
bpv [T=median]: 0.825
bpv [T=mean]: 0.510
bpv [T=std]: 0.489
In [35]:
az.plot_autocorr(idata_pymc, var_names=theta_names,
                 combined=True, grid=(NROWS(theta_ndim), NCOLS),
                 labeller=theta_labeller)
print("Combined autocorrelation times:")
Combined autocorrelation times:
In [41]:
print("Graphical model:")
pm.model_graph.model_to_graphviz(model)
Graphical model:
Out[41]:
cluster100 x 100 100 x 100 cluster2 2 cluster3 3 cluster100 100 X X ~ Data β β ~ DensityDist X->β y_obs y_obs ~ Normal X->y_obs α0 and log σ α0 and log σ ~ DensityDist α0 α0 ~ Deterministic α0 and log σ->α0 σ2 σ2 ~ Deterministic α0 and log σ->σ2 α0 and log σ->β α0 and log σ->y_obs α0->y_obs σ2->β β->y_obs τ τ ~ Uniform τ->β τ->y_obs

Out-of-sample predictions

First we take a look at the distribution of predictions on a previously unseen dataset.

In [36]:
# -- Generate and plot posterior predictive samples from X_test

model_test = make_model(theta_space, g, eta, X_test, Y_test, theta_names,
                        theta_names_aux[:1], mle_theta)

with model_test:
    print("Generating posterior predictive on hold-out data...")
    pp_test = utils.generate_pp(idata_pymc, X_test, theta_names, rng=rng)
    idata_pp_test = utils.pp_to_idata(
        [pp_test], idata_pymc, ['y_star'], y_obs=Y_test)

# Posterior predictive checks
fig, axs = plt.subplots(1, 2, figsize=(10, 4))
plt.suptitle(r"Posterior predictive checks on $X_{test}$")

utils.plot_ppc(idata_pp_test, n_samples=500,
               ax=axs[0], data_pairs={"y_obs": "y_star"})

az.plot_bpv(idata_pp_test, kind='t_stat', t_stat='mean',
            plot_mean=False, ax=axs[1],
            bpv=False, data_pairs={"y_obs": "y_star"})
axs[1].axvline(Y_test.mean(), ls="--",
               color="r", lw=2, label=r"$\bar Y_{test}$")
handles, labels = axs[1].get_legend_handles_labels()
handles.extend([Line2D([0], [0], label=r"Distribution of $\bar Y_{test}^*$")])
_ = axs[1].legend(handles=handles)

# Show Bayesian p-values
for name, stat in statistics:
    bpv = utils.bpv(pp_test, Y_test, stat)
    print(f"bpv [T={name}]: {bpv:.3f}")
Generating posterior predictive on hold-out data...
Posterior predictive samples:   0%|          | 0/2 [00:00<?, ?it/s]
bpv [T=min]: 0.628
bpv [T=max]: 0.697
bpv [T=median]: 0.520
bpv [T=mean]: 0.509
bpv [T=std]: 0.634

Next we look at the MSE when using several point-estimates for the parameters, as well as the mean of the posterior samples.

In [37]:
# -- Compute metrics using several point estimates

df_metrics_pymc = pd.DataFrame(columns=results_columns)

# Posterior mean estimate
Y_hat_pp = pp_test[:, ::thin_pp, :].mean(axis=(0, 1))
metrics_pp = utils.regression_metrics(Y_test, Y_hat_pp)
df_metrics_pymc.loc[0] = [
    "pymc_posterior_mean",
    p_hat,
    metrics_pp["mse"],
    metrics_pp["rmse"],
    metrics_pp["r2"]
]

# Point estimates
for i, pe in enumerate(point_estimates):
    Y_hat_pe = utils.point_predict(
        X_test, idata_pymc,
        theta_names, pe)
    metrics_pe = utils.regression_metrics(Y_test, Y_hat_pe)
    df_metrics_pymc.loc[i + 1] = [
        "pymc_" + pe,
        p_hat,
        metrics_pe["mse"],
        metrics_pe["rmse"],
        metrics_pe["r2"]
    ]

df_metrics_pymc.sort_values(results_columns[-2], inplace=True)
df_metrics_pymc.style.hide_index()
Out[37]:
Estimator Features MSE RMSE $R^2$
pymc_posterior_mean 3 0.496 0.705 0.997
pymc_mode 3 0.509 0.713 0.997
pymc_median 3 0.509 0.713 0.997
pymc_mean 3 0.510 0.714 0.997
In [38]:
# -- Test variable selection procedure

df_metrics_pymc_var_sel = pd.DataFrame(columns=results_columns)

for pe in point_estimates:
    df_var_sel = bayesian_var_sel(
        idata_pymc, theta_space, theta_names, X_fd,
        Y, X_test_fd, Y_test, folds, prefix="pymc",
        point_est=pe)

    df_metrics_pymc_var_sel = df_metrics_pymc_var_sel.append(df_var_sel)

df_metrics_pymc_var_sel.sort_values(
    results_columns[-2], inplace=True)
df_metrics_pymc_var_sel.style.hide_index()
Out[38]:
Estimator Features MSE RMSE $R^2$
pymc_mode+sk_lasso 3 0.511 0.714 0.997
pymc_mode+sk_ridge 3 0.511 0.715 0.997
pymc_median+sk_lasso 3 0.513 0.716 0.997
pymc_median+sk_ridge 3 0.513 0.716 0.997
pymc_mean+sk_lasso 3 0.515 0.718 0.997
pymc_mean+sk_ridge 3 0.515 0.718 0.997
pymc_mean+sk_svm_rbf 3 7.416 2.723 0.951
pymc_median+sk_svm_rbf 3 7.649 2.766 0.949
pymc_mode+sk_svm_rbf 3 10.349 3.217 0.931

Save & Load

In [42]:
# -- Save

_ = idata_pymc.to_netcdf("pymc-p-fixed.nc")
In [43]:
# -- Load

idata_pymc = az.from_netcdf("pymc-p-fixed.nc")

Notebook metadata

In [40]:
%load_ext watermark
%watermark -n -u -v -iv -w
Last updated: Wed Dec 22 2021

Python implementation: CPython
Python version       : 3.9.9
IPython version      : 7.30.1

arviz     : 0.11.4
pymc3     : 3.11.4
autopep8  : 1.6.0
scipy     : 1.7.3
matplotlib: 3.5.1
skfda     : 0.0
numpy     : 1.20.3
sys       : 3.9.9 (main, Dec 22 2021, 14:49:36) 
[GCC 11.1.0]
theano    : 1.1.2
pandas    : 1.3.5
json      : 2.0.9
emcee     : 3.1.1
logging   : 0.5.1.2

Watermark: 2.2.0